c( # I use linux, this saves my time loading/installing packages
"ProjectTemplate", "here", "stringr",
"dplyr", "knitr", "ggplot2", "tabplot",
"patchwork", "tableone", "tidyr", "visdat", "glmnet",
"doParallel", "kableExtra", "dtplyr", "purrr",
"kableExtra"
) |>
lapply(function(x) {
if (!require(x, character.only = TRUE)) {
install.packages(x, dependencies = TRUE)
library(x)
}
})[[1]]
NULL
[[2]]
NULL
[[3]]
NULL
[[4]]
NULL
[[5]]
NULL
[[6]]
NULL
[[7]]
NULL
[[8]]
NULL
[[9]]
NULL
[[10]]
NULL
[[11]]
NULL
[[12]]
NULL
[[13]]
NULL
[[14]]
NULL
[[15]]
NULL
[[16]]
NULL
[[17]]
NULL
set.seed(42) # setting seed for stochastic functions
setwd("..") # here()) # needed as we are in /src, in linux here() should be used
load.project() # Loading the projectProject name: 2023-retention-ABCD
Loading project configuration
Autoloading helper functions
Running helper script: globals.R
Running helper script: helpers.R
Running helper script: pclean.R
Autoloading data
Munging data
Running preprocessing script: 01-A.R
pclean()[1] "Hey! Cleaning files at src folder, look in reports for them."
# R options
options(
digits = 3, # Only two decimal digits
scipen = 999, # Remove scientific notation
width = 100
)
# Knitr options
knitr::opts_chunk$set(
comment = NA, # remove comment symbol
cache.path = "../cache/", # where should I save cache?
fig.path = "../graphs/", # where should I save figures?
echo = T, # dont echo by default
cache = F, # dont cache by default
fig.width = 10, # setting the best witdth for figures
fig.height = 7, # best height
dpi = 300, # high dpi for publication quality,
error = FALSE # do not interrupt in case of errors
)
cb_palette <- c(
"#999999", "#E69F00", "#56B4E9", "#009E73",
"#F0E442", "#0072B2", "#D55E00", "#CC79A7"
)
theme_luis <- function() {
return_theme <- ggplot2::theme_bw(12) +
ggplot2::theme(
panel.border = element_rect(colour = "black"),
legend.background = element_rect(linetype = 1, size = 0.2, colour = 1)
)
}
# register parallel backend
cl <- makeCluster(detectCores()-3, outfile = "")
registerDoParallel(cl)retention <- read.csv("~/Downloads/dataIndiv_ID_2023-03-08.csv") %>%
filter(event == "Baseline" | event == "1 Year") %>%
select(pguid, event, visit_status_aggr, income_household, site, race_ethnicity, education) %>%
mutate(missed = ifelse(visit_status_aggr == "Missed (explained)" |
visit_status_aggr == "Missed (unexplained)", 0, 1),
site = as.factor(site),
race_ethnicity = factor(race_ethnicity),
education = factor(education, ordered = T),
income_household = factor(income_household, ordered = T)
)
# below is a version with wide format, which is the correct way, but there is an oddity in
# the data that the values for some demographic info only appear at the follow up, not at the baseline
# so I suppose the data was joined incorrectly.
# retention_wide <- dataIndiv_ID_2023.03.08 %>%
# select(pguid, event, visit_status_aggr, income_household, site, race_ethnicity, education) %>%
# group_by(pguid) %>%
# filter(event == "Baseline" | event == "1 Year") %>%
# mutate(missed = ifelse(visit_status_aggr == "Missed (explained)" |
# visit_status_aggr == "Missed (unexplained)", 1, 0)) %>%
# pivot_wider(names_from = event, values_from = c(missed, income_household, site, race_ethnicity, education)) %>%
# mutate(`missed_1 Year` = ifelse(is.na(`missed_1 Year`), 0, `missed_1 Year`))
# retention %>%
# group_by(event) %>%
# summarise(n = n(),
# n_missed = sum(missed_visit_1y),
# perc_missed = round(n_missed/n, 2)) |>
# kable(align = "c", caption = "Number of participants and percentage of missed visits by event") |>
# kable_styling()
# # About 5% of participants missed their 1 year visit# Initialize empty vectors for storing results
lambda_values <- c()
RMSE_values <- c()
var_imp_avg <- NULL
# Iterate over the sites
for (isite in unique(retention$site)) {
# Split retention into train and test
train <- retention %>%
filter(site != isite) # Leave out the current site
test <- retention %>%
filter(site == isite) # Use only the current site
# Prepare dataset in glmnet format for train and test sets
outcome_train <- data.matrix(train$missed)
outcome_test <- data.matrix(test$missed)
predictors_train <- train %>%
select(income_household, race_ethnicity, education) %>%
data.matrix()
predictors_test <- test %>%
select(income_household, race_ethnicity, education) %>%
model.matrix(~ ., data = .)
message("Site: ", isite)
# Perform cross-validated glmnet model fitting
cv_model <- cv.glmnet(predictors_train, outcome_train,
nfolds = 21, # Use LOOCV
family = "binomial",
parallel =T)
# Get the best lambda
best_lambda <- cv_model$lambda.1se
# Fit the model using the best lambda on the test set
fit <- glmnet(predictors_test, outcome_test,
alpha = 1,
type.measure = "mse",
lambda = best_lambda,
family = "binomial",
intercept = FALSE)
var_imp <- broom::tidy(fit)
# Store the results
RMSE <- cv_model$cvm[cv_model$lambda == cv_model$lambda.1se] %>% sqrt() %>% round(2)
lambda <- cv_model$lambda.1se %>% round(2)
if (is.null(var_imp_avg)) {
var_imp_avg <- var_imp |>
mutate(site = isite)
} else {
# var_imp_avg <- full_join(var_imp_avg, var_imp, by = c("class","term"))
var_imp_avg <- var_imp |>
mutate(site = isite) |>
bind_rows(var_imp_avg)
lambda_values <- lambda_values %>% append(lambda)
RMSE_values <- RMSE_values %>% append(RMSE)
}
}Site: OHSU
Warning in lognet(xd, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one multinomial or
binomial class has fewer than 8 observations; dangerous ground
Site: UCSD
Site: YALE
Site: ROC
Site: WUSTL
Site: UMB
Site: UVM
Site: FIU
Site: UFL
Site: LIBR
Site: UMN
Site: VCU
Site: UTAH
Site: CUB
Site: MUSC
Site: CHLA
Site: UMICH
Site: UCLA
Site: SRI
Site: UPMC
Site: UWM
a <- var_imp_avg %>%
# filter(class == 0) %>%
group_by(term) %>%
summarise(avg_coef = mean(estimate),
n = n(),
std_error = sd(estimate) / sqrt(n)) %>%
mutate(lower = avg_coef - 1.96 * std_error,
upper = avg_coef + 1.96 * std_error
) %>%
drop_na() # will be only keeping vars that consistently appear in all sitesa %>%
select(term, avg_coef, lower, upper) %>%
# decreasing avg_coef
arrange(desc(avg_coef)) %>%
#mutate(Importance = round(Importance, 2)) %>%
kableExtra::kbl(booktabs = T,
col.names = c("Factor", "mean coeff.", "95% CI - lower", "95% CI - upper"),
escape = TRUE) %>%
kable_styling(latex_options = c("striped", "hold_position", "repeat_header"),
full_width = F)| Factor | mean coeff. | 95% CI - lower | 95% CI - upper |
|---|---|---|---|
| education.L | 3.963 | 2.766 | 5.160 |
| race_ethnicityWhite | 2.573 | 2.234 | 2.913 |
| race_ethnicityHispanic | 2.099 | 1.801 | 2.396 |
| race_ethnicityOther | 1.786 | 1.524 | 2.048 |
| race_ethnicityBlack | 1.775 | 1.437 | 2.113 |
| education^10 | 1.469 | 0.974 | 1.964 |
| education^9 | 1.071 | 0.817 | 1.325 |
| race_ethnicityUnknown | 0.885 | 0.684 | 1.085 |
| education^17 | 0.597 | -0.451 | 1.644 |
| education^5 | 0.568 | 0.010 | 1.126 |
| education^11 | 0.242 | -0.160 | 0.644 |
| education^14 | -0.246 | -0.623 | 0.132 |
| income_household.Q | -0.371 | -0.672 | -0.071 |
| education^19 | -0.381 | -0.834 | 0.072 |
| education^20 | -0.479 | -1.232 | 0.274 |
| income_household.L | -0.511 | -0.858 | -0.163 |
| education^21 | -0.787 | -1.402 | -0.171 |
| education^13 | -0.827 | -1.219 | -0.435 |
| education.C | -1.602 | -2.132 | -1.072 |
# generate a point-in-range plot similar to the table above
a %>%
ungroup() %>%
# arrange by avg_coef
ggplot(aes(x = term, y = avg_coef)) +
geom_point() +
geom_errorbar(aes(ymin = lower, ymax = upper), width = 0.2) +
geom_hline(yintercept = 0, linetype = "dashed") +
coord_flip() +
labs(x = "Factor", y = "Mean coefficient", title = "Point-in-range plot of the mean coefficient of each factor") +
theme_bw() plot(cv_model)It does not work too well in the Leave-one-site-out cv, figure out how to provide this plot in this case.
plot(fit, xvar = "dev", label = TRUE)Looks weird in the binomial case, ok in the multinomial. Why? Results are identical.
plot(cv_model)plot(fit, xvar = "dev", label = TRUE)library(tabplot)
tableplot(retention)library(visdat)
vis_miss(retention)Category names?
dataIndiv_ID_2023.03.08 %>%
filter(kbi_gender %in% c(1, 2, 3)) %>%
ggplot(aes(as.numeric(interview_age), fill = as.factor(kbi_gender))) +
geom_density(alpha = .3) +
labs(
title = "Age Male x Female", x = "Age",
y = "Density"
) +
theme_luis()Warning: The `size` argument of `element_rect()` is deprecated as of ggplot2
3.4.0.
ℹ Please use the `linewidth` argument instead.
This warning is displayed once every 8 hours.
Call `lifecycle::last_lifecycle_warnings()` to see where this warning
was generated.
Warning: Removed 12727 rows containing non-finite values (`stat_density()`).
project.info$config
$config$version
[1] "0.10.2"
$config$data_loading
[1] TRUE
$config$data_loading_header
[1] TRUE
$config$data_ignore
[1] ""
$config$cache_loading
[1] TRUE
$config$recursive_loading
[1] FALSE
$config$munging
[1] TRUE
$config$logging
[1] FALSE
$config$logging_level
[1] "INFO"
$config$load_libraries
[1] FALSE
$config$libraries
[1] "reshape2, plyr, tidyverse, stringr, lubridate"
$config$as_factors
[1] FALSE
$config$tables_type
[1] "tibble"
$config$attach_internal_libraries
[1] FALSE
$config$cache_loaded_data
[1] TRUE
$config$sticky_variables
[1] "NONE"
$config$underscore_variables
[1] TRUE
$config$cache_file_format
[1] "RData"
$helpers
[1] "globals.R" "helpers.R" "pclean.R"
sessionInfo()R version 4.2.2 (2022-10-31)
Platform: x86_64-solus-linux-gnu (64-bit)
Running under: Solus 4.3 Fortitude
Matrix products: default
BLAS: /usr/lib64/haswell/libopenblas_haswellp-r0.3.20.so
LAPACK: /usr/lib64/R/lib/libRlapack.so
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C LC_TIME=pt_BR.UTF-8
[4] LC_COLLATE=en_US.UTF-8 LC_MONETARY=pt_BR.UTF-8 LC_MESSAGES=en_US.UTF-8
[7] LC_PAPER=pt_BR.UTF-8 LC_NAME=C LC_ADDRESS=C
[10] LC_TELEPHONE=C LC_MEASUREMENT=pt_BR.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] parallel stats graphics grDevices utils datasets methods base
other attached packages:
[1] rmarkdown_2.19 purrr_1.0.1 dtplyr_1.2.2 kableExtra_1.3.4
[5] doParallel_1.0.17 iterators_1.0.14 foreach_1.5.2 glmnet_4.1-6
[9] Matrix_1.5-1 visdat_0.6.0 tidyr_1.3.0 tableone_0.13.2
[13] patchwork_1.1.2 tabplot_1.4.1 ffbase_0.13.2 ff_4.0.9
[17] bit_4.0.5 ggplot2_3.4.0 knitr_1.41 dplyr_1.1.2
[21] stringr_1.5.0 here_1.0.1 ProjectTemplate_0.10.2 tibble_3.2.1
[25] digest_0.6.31 nvimcom_0.9-131
loaded via a namespace (and not attached):
[1] Rcpp_1.0.9 svglite_2.1.1 lattice_0.20-45 rprojroot_2.0.3 utf8_1.2.2
[6] R6_2.5.1 backports_1.4.1 survey_4.1-1 evaluate_0.19 highr_0.10
[11] httr_1.4.4 pillar_1.9.0 rlang_1.1.0 rstudioapi_0.14 data.table_1.14.6
[16] jquerylib_0.1.4 labeling_0.4.2 splines_4.2.2 webshot_0.5.4 munsell_0.5.0
[21] broom_1.0.2 compiler_4.2.2 xfun_0.39 pkgconfig_2.0.3 systemfonts_1.0.4
[26] shape_1.4.6 htmltools_0.5.4 mitools_2.4 tidyselect_1.2.0 codetools_0.2-18
[31] fansi_1.0.3 viridisLite_0.4.1 crayon_1.5.2 withr_2.5.0 grid_4.2.2
[36] jsonlite_1.8.4 gtable_0.3.1 lifecycle_1.0.3 DBI_1.1.3 magrittr_2.0.3
[41] scales_1.2.1 cachem_1.0.6 cli_3.6.0 stringi_1.7.12 farver_2.1.1
[46] bslib_0.4.2 xml2_1.3.3 generics_0.1.3 vctrs_0.6.1 fastmatch_1.1-3
[51] tools_4.2.2 glue_1.6.2 yaml_2.3.6 fastmap_1.1.0 survival_3.4-0
[56] colorspace_2.0-3 rvest_1.0.3 sass_0.4.6
Post-doc T32. luis.araujo@vcuhealth.org
↩